In this section, we ask the question: “How does the amount of arable land (or geographic information more generally) relate to selected quality of life indicators?”
The Geographic information we have is:
[Info here]
The Quality of Life indicators are:
[Info here]
We begin the analysis by reading in the data (and filtering the relevant columns) as is done in the following R code.
# Function to display the columns and nan counts
col_names_and_nas <- function(df) {
for (i in seq_along(df)) {
cat(colnames(df)[i], ":", sum(is.na(df[[i]])), "\n")
}
}
# read data
full_df <- read.csv("../data/clean_data.csv", header = TRUE)
# Trim data
cols_to_keep <- c("Geography..Map.references", "Geography..Geographic.coordinates", "Geography..Area...land", "Geography..Area...total", "Geography..Climate", "Geography..Land.use...agricultural.land", "Economy..Real.GDP.per.capita", "People.and.Society..Population...total", "People.and.Society..Population.growth.rate", "People.and.Society..Birth.rate", "People.and.Society..Death.rate", "People.and.Society..Maternal.mortality.ratio", "People.and.Society..Infant.mortality.rate...total", "People.and.Society..Life.expectancy.at.birth...total.population" ,"People.and.Society..Physician.density" ,"Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants", "Communications..Internet.users...percent.of.population")
trim_df <- full_df[,cols_to_keep]
trim_df$Country <- full_df[,1]
rownames(trim_df) <- full_df[,1]
We now concern ourselves with the cleaning of the selected columns as many have strings mixed in with numeric values.
library(stringr)
# Geography..Area...land
trim_df$Geography..Area...land <- as.numeric(gsub(",", "", str_extract(trim_df$Geography..Area...land, "[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "Geography..Area...land"] <- "Land.sqkm"
# Geography..Area...total
trim_df$Geography..Area...total <- as.numeric(gsub(",", "", str_extract(trim_df$Geography..Area...total, "[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "Geography..Area...total"] <- "Total.sqkm"
# Geography..Climate
trim_df$Geography..Climate <- str_extract(trim_df$Geography..Climate, "^[^;]+")
colnames(trim_df)[colnames(trim_df) == "Geography..Climate"] <- "Climate"
# Geography..Land.use...agricultural.land
trim_df$Geography..Land.use...agricultural.land <- as.numeric(sub("%", "", str_extract(trim_df$Geography..Land.use...agricultural.land, "[0-9.]+%")))/100
colnames(trim_df)[colnames(trim_df) == "Geography..Land.use...agricultural.land"] <- "Arable.land.percent"
# Economy..Real.GDP.per.capita
trim_df$Economy..Real.GDP.per.capita <- as.numeric(gsub("[$,]", "", str_extract(trim_df$Economy..Real.GDP.per.capita, "\\$[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "Economy..Real.GDP.per.capita"] <- "Real.GDP.per.capita.USD"
# People.and.Society..Population.growth.rate
trim_df$People.and.Society..Population.growth.rate <- as.numeric(sub("%", "", str_extract(trim_df$People.and.Society..Population.growth.rate, "[0-9.]+%"))) / 100
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Population.growth.rate"] <- "Population.growth.rate.percent"
# People.and.Society..Birth.rate - births
trim_df$People.and.Society..Birth.rate <- as.numeric(str_extract(trim_df$People.and.Society..Birth.rate, "^[0-9]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Birth.rate"] <- "Live.births.per.1000.people"
# People.and.Society..Death.rate - deaths per person
trim_df$People.and.Society..Death.rate <- as.numeric(str_extract(trim_df$People.and.Society..Death.rate, "^[0-9]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Death.rate"] <- "Deaths.per.1000.people"
# People.and.Society..Maternal.mortality.ratio - deaths per live birth
trim_df$People.and.Society..Maternal.mortality.ratio <- as.numeric(gsub(",", "", str_extract(trim_df$People.and.Society..Maternal.mortality.ratio, "^[0-9]+"))) / 100000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Maternal.mortality.ratio"] <- "Maternal.deaths.per.100000.live.births"
# People.and.Society..Infant.mortality.rate...total - deaths per live birth
trim_df$People.and.Society..Infant.mortality.rate...total <- as.numeric(str_extract(trim_df$People.and.Society..Infant.mortality.rate...total, "^[0-9.]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Infant.mortality.rate...total"] <- "Infant.deaths.per.1000.live.births"
# People.and.society..Life.expectancy.at.birth...total.population
trim_df$People.and.Society..Life.expectancy.at.birth...total.population <- as.numeric(str_extract(trim_df$People.and.Society..Life.expectancy.at.birth...total.population, "^[0-9.]+"))
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Life.expectancy.at.birth...total.population"] <- "Life.expectancy.at.birth"
# People.and.Society..Physician.density
trim_df$People.and.Society..Physician.density <- as.numeric(str_extract(trim_df$People.and.Society..Physician.density, "^[0-9.]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Physician.density"] <- "Physicians.per.1000.people"
# Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants
trim_df$Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants <- as.numeric(str_extract(trim_df$Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants, "^[0-9]+")) / 100
colnames(trim_df)[colnames(trim_df) == "Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants"] <- "Avg.cellphone.subscriptions.per.person"
# Communications..Internet.users...percent.of.population
trim_df$Communications..Internet.users...percent.of.population <- as.numeric(str_extract(trim_df$Communications..Internet.users...percent.of.population, "^[0-9.]+")) / 100
colnames(trim_df)[colnames(trim_df) == "Communications..Internet.users...percent.of.population"] <- "Internet.users.percent.of.population"
# People.and.Society..Population...total
trim_df$People.and.Society..Population...total <- as.numeric(gsub(",", "", str_extract(trim_df$People.and.Society..Population...total, "^[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Population...total"] <- "Total.population"
# Geography..Map.references
colnames(trim_df)[colnames(trim_df) == "Geography..Map.references"] <- "Continent"
# Note, you must re-run the whole notebook to not have issues with the nan overwriting
Note that we have removed entries that may not be countries so our numbers may not reflect those of other surveys. The preprocessing is however, accessible.
Let us begin by examining total and arable land by continent.
The continents we examine are:
continents <- unlist(unique(trim_df$Continent))
continents
## [1] "Asia" "Europe"
## [3] "Africa" "Central America and the Caribbean"
## [5] "South America" "Oceania"
## [7] "Middle East" "North America"
## [9] "Southeast Asia"
# Land per continent
continent_land <- trim_df %>%
group_by(Continent) %>%
summarise(TotalLand = sum(Land.sqkm, na.rm = TRUE))
# Arable land per continent
continent_arable_land <- trim_df %>%
mutate(ArableLand = (Arable.land.percent) * Land.sqkm) %>%
group_by(Continent) %>%
summarise(TotalArableLand = sum(ArableLand, na.rm = TRUE))
# Combine into long format
combined <- left_join(continent_land, continent_arable_land, by="Continent") %>%
pivot_longer(cols = c(TotalLand, TotalArableLand),
names_to = "Type",
values_to = "Area") %>%
mutate(Type = recode(Type,
TotalLand = "Total Land",
TotalArableLand = "Arable Land"))
# Sort
continent_order <- combined %>%
filter(Type == "Total Land") %>%
arrange(desc(Area)) %>%
pull(Continent)
combined$Continent <- factor(combined$Continent, levels=continent_order)
# Plot - skqm of land and arable land
ggplot(combined, aes(x = Continent, y = Area, fill = Type)) +
geom_bar(stat = "identity", position = position_dodge(width=0.8), width = 0.7, color = "black") +
scale_y_continuous(labels= comma) +
labs(
title = "Land vs. Arable Land Area by Region",
x = "Region",
y = "Area (sq. km)",
fill = "Land Type"
) +
theme_minimal(base_size=14) +
theme(axis.text.x = element_text(angle = 45, hjust=1)) +
scale_fill_brewer(palette="Set1")
# Compute the proportion of arable land
continent_proportion <- trim_df %>%
mutate(ArableLand = (Arable.land.percent) * Land.sqkm) %>%
group_by(Continent) %>%
summarise(
TotalLand = sum(Land.sqkm, na.rm = TRUE),
TotalArableLand = sum(ArableLand, na.rm = TRUE)
) %>%
mutate(Proportion = TotalArableLand / TotalLand)
# Sort by proportion
continent_proportion$Continent <- factor(
continent_proportion$Continent,
levels = continent_proportion$Continent[order(-continent_proportion$Proportion)]
)
# Plot - sqkm of arable land ratio
ggplot(continent_proportion, aes(x = Continent, y = Proportion, fill = Continent)) +
geom_bar(stat = "identity", width = 0.7, color = "black") +
scale_y_continuous(labels= percent_format(accuracy = 1)) +
labs(
title = "Proportion of Arable Land by Region",
x = "Region",
y = "Arable Land (% of Total Land)"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle=45, hjust=1),
legend.position = "none"
) +
scale_fill_brewer(palette = "Paired")
At this point in the analysis we look at a few quality of life indicators including Real GDP per Capita and some other statistics concerning public health, again per region. (These should by put in one side-by-side analysis)
# Function to generate a bar chart for a specific numeric variable
plot_bar_chart <- function(data, variable, group_var = "Continent", agg_fun = mean, palette = "Paired", title = NULL, y_label = NULL, x_label = "Region") {
# Check if the column exists in the dataframe
if(!(variable %in% colnames(data))) {
stop(paste("Column", variable, "not found in the dataframe"))
}
# Create a summarized dataset by the chosen grouping variable
summary_data <- data %>%
group_by(.data[[group_var]]) %>%
summarise(Aggregate = agg_fun(.data[[variable]], na.rm = TRUE)) %>%
arrange(desc(Aggregate))
# Define title and y-label if not provided
if (is.null(title)) title <- paste("Average", variable, "by", group_var)
if (is.null(y_label)) y_label <- paste("Average", variable)
# Plot the bar chart
ggplot(summary_data, aes(x = reorder(.data[[group_var]], -Aggregate), y = Aggregate, fill = .data[[group_var]])) +
geom_bar(stat = "identity", width = 0.7, color = "black") +
scale_y_continuous(labels = scales::comma) +
labs(
title = title,
x = x_label,
y = y_label
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
scale_fill_brewer(palette = "Paired")
}
Average real GDP per capita per region:
plot_bar_chart(trim_df, "Real.GDP.per.capita.USD", title= "Average Real GDP per Capita by Region", y_label="Real GDP per Capita (2021 USD)", x_label="Region")
Average number of physicians rate, “Physicians.per.1000.people”:
plot_bar_chart(trim_df, "Physicians.per.1000.people", title= "Average No. Physicians per 1000 People", y_label="No. Physicians per 1000 People", x_label="Region")
Average infant mortality rate, “Infant.deaths.per.1000.live.births”:
plot_bar_chart(trim_df, "Infant.deaths.per.1000.live.births", title= "Average Infant Mortality Rate", y_label="Infant Deaths per 1000 Live Births", x_label="Region")
Average maternal mortality rate, “Maternal.deaths.per.100000.live.births”:
plot_bar_chart(trim_df, "Maternal.deaths.per.100000.live.births", title= "Average Maternal Mortality Rate", y_label="Maternal Deaths per 100,000 live Births", x_label="Region")
Average “Life.expectancy.at.birth”:
plot_bar_chart(trim_df, "Life.expectancy.at.birth", title= "Average Life expectancy at birth", y_label="Years After Birth", x_label="Region")
Average death rate, “Deaths.per.1000.people”:
plot_bar_chart(trim_df, "Deaths.per.1000.people", title= "Average Death Rate", y_label="Deaths per 1000 People", x_label="Region")
Average birth rate, “Live.births.per.1000.people”:
plot_bar_chart(trim_df, "Live.births.per.1000.people", title= "Average Birth Rate", y_label="Live Births per 1000 people", x_label="Region")
Average “Population.growth.rate.percent”:
plot_bar_chart(trim_df, "Population.growth.rate.percent", title= "Average Population Growth Rate", y_label="Percent Growth Rate", x_label="Region")
Total population, “Population”:
plot_bar_chart(trim_df, "Total.population", title= "Total Population", agg_fun=sum, y_label="No. of People", x_label="Region")
Note that the model of exponential population growth doesn’t totally work here, see Africa and Asia. The failing assumption is likely gender balance.
Average percent of population online, “Internet.users.percent.of.population”:
plot_bar_chart(trim_df, "Internet.users.percent.of.population", title= "Average Proportion of Population Online", y_label="Proportion of Internet Users", x_label="Region")
Average number of cellphone subscriptions per inhabitants, “Avg.cellphone.subscriptions.per.person”:
plot_bar_chart(trim_df, "Avg.cellphone.subscriptions.per.person", title= "Average Number of Cellphone Subscriptions per Inhabitant", y_label="Subscriptions", x_label="Region")
Now we will do principal components analysis on the numeric-continuous variables visualized above and visualize the biplot. (code the datapoints by region – maybe cluster if it looks worthwhile)
Aggregate visualized numeric-continuous columns (Note we remove some with the na.omit command)
numeric_df <- trim_df[,c("Total.sqkm", "Arable.land.percent", "Real.GDP.per.capita.USD", "Physicians.per.1000.people", "Infant.deaths.per.1000.live.births", "Maternal.deaths.per.100000.live.births", "Life.expectancy.at.birth", "Deaths.per.1000.people", "Live.births.per.1000.people", "Population.growth.rate.percent", "Total.population", "Internet.users.percent.of.population", "Avg.cellphone.subscriptions.per.person")]
Perform PCA:
pca_result <- prcomp(na.omit(numeric_df), center=TRUE, scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.5430 1.2301 1.1545 0.92118 0.80601 0.77879 0.70818
## Proportion of Variance 0.4975 0.1164 0.1025 0.06527 0.04997 0.04666 0.03858
## Cumulative Proportion 0.4975 0.6139 0.7164 0.78165 0.83162 0.87828 0.91686
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.59083 0.49590 0.43754 0.41600 0.25634 0.23588
## Proportion of Variance 0.02685 0.01892 0.01473 0.01331 0.00505 0.00428
## Cumulative Proportion 0.94371 0.96263 0.97735 0.99067 0.99572 1.00000
raw_variance <- pca_result$sdev^2
pve <- raw_variance / sum(raw_variance)
cumulative_pve = cumsum(pve)
pca_performance_data <- data.frame(
PC = factor(1:length(pve), levels = 1:length(pve)),
RawVariance = raw_variance,
PVE = pve,
CumulativePVE = cumulative_pve
)
PCA Variances Plot:
ggplot(pca_performance_data, aes(x=PC, y=RawVariance)) +
geom_line(aes(group=1), color="red", size=1) +
geom_point(aes(color="red"), size=3)+
geom_hline(yintercept=1, linetype="dotted", color="black")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
labs(
title = "Scree Plot (Raw Variance)",
x = "Principal Components",
y = "Raw Variance"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust=1))
## NULL
Cumulative Proportion of Variance Explained (CPVE) plot:
ggplot(pca_performance_data, aes(x = PC, y = CumulativePVE, group=1)) +
geom_line(color="blue", size=1) +
geom_point(aes(color="blue"), size=3) +
geom_hline(yintercept=0.9, linetype="dotted", color="black")
labs(
title = "Cumulative Proportion of Variance Explained (PVE)",
x = "Principal Components",
y = "Cumulative PVE"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## NULL
Here we plot the loading vectors of the PCA on the axes of the first and second principal components.
# Create dataframe for loadings
loadings_df <- as.data.frame(pca_result$rotation[, 1:2])
loadings_df$varname <- rownames(loadings_df)
ggplot(loadings_df, aes(x = PC1, y = PC2)) +
geom_segment(aes(x = 0, y = 0, xend = PC1, yend = PC2),
arrow = arrow(length = unit(0.2,"cm")), color = "blue") +
geom_text_repel(aes(label = varname), size = 3, max.overlaps = 100) +
labs(title = "PCA Biplot: Loadings Only") +
theme_minimal()
To interpret the principal components,we say the following:
PC2 appears to contain information regarding country size in both population and land area, large values indicating larger countries. The only “out of place” variable contained in this is the “Deaths.per.1000.people” variable.
PC1 appears to represent the developing status of a nation with larger scores being given to more developed nations.
We can see something of a contrast between the variables of “Physicians.per.1000.people” and “Population.growth.rate.percent” along the line: \(\text{PC}_2 = \frac{1}{2}\text{PC}_1\). From this one may interpret, countries with a higher rate of physicians in the population have a smaller population growth rate.
Here we plot the most extreme observations defined: \[\text{extreme_score}_i = \sqrt{\text{PC}1_i^2 + \text{PC}2_i^2}\]
meta_df <- trim_df[, c("Country", "Continent")]
# Get PCA scores (observations)
scores_df <- as.data.frame(pca_result$x[, 1:2])
scores_df$Country <- rownames(scores_df)
# Add distance from origin in PC1-PC2 space
scores_df <- scores_df %>%
mutate(Distance = sqrt(PC1^2 + PC2^2))
# Keep top 10 extreme countries - euclidean norm of first two principal components
top_extremes <- scores_df %>%
arrange(desc(Distance)) %>%
slice_head(n = 50)
top_extremes <- top_extremes %>%
left_join(meta_df, by = "Country")
# Plot
ggplot(top_extremes, aes(x = PC1, y = PC2, label = Country, color = Continent)) +
geom_point(size = 2) +
geom_text_repel(
size = 2,
max.overlaps = Inf, # allow all to be plotted
force = 2, # increase repulsion strength
force_pull = 0.1, # balance between point and label
box.padding = 0.5, # more space around text
point.padding = 0.3, # space between text and point
segment.color = "grey50", # optional connector color
segment.size = 0.4
) +
geom_hline(yintercept=0, linetype="dotted", color="black") +
geom_vline(xintercept=0, linetype="dotted", color="black") +
labs(title = "Top 50 Most Extreme Countries in PC1–PC2 Space",
x = "PC1", y = "PC2") +
theme_minimal() +
scale_color_brewer(palette = "Paired")
# Get scores
pc_scores <- as.data.frame(pca_result$x[, 1:2])
colnames(pc_scores) <- c("PC1", "PC2")
# Define a symmetric range around (0,0)
max_abs <- max(abs(pc_scores$PC1), abs(pc_scores$PC2))
grid_limit <- ceiling(max_abs) + 1 # Pad
# 2D KDE
kde <- kde2d(pc_scores$PC1, pc_scores$PC2,
n=100, lims= c(-grid_limit, grid_limit, -grid_limit, grid_limit))
# Interactive 3D surface plot
plot_ly(
x=kde$x,
y=kde$y,
z=kde$z,
type="surface"
) %>%
layout(
title="Interactive 3D KDE of PC1-PC2",
scene = list(
xaxis=list(title="PC1 (x)", range=c(-grid_limit, grid_limit)),
yaxis=list(title="PC2 (y)", range=c(-grid_limit, grid_limit)),
zaxis=list(title="Density")
)
)
Here we examine the pairs plot of all continuous variables.
numeric_df_normalized <- scale(numeric_df)
colnames(numeric_df_normalized) <- c("Land", "Ag. Land", "GDP/capita", "Doc. Rate", "Inf. Mort.", "Mat. Mort.", "Life Expect.", "Death Rate", "Birth Rate", "Pop. Growth", "Tot. Pop.", "Internet Use", "Cellphones")
# Set graphical parameters to adjust the size of the boxes
old_par <- par(no.readonly = TRUE) # Save the original plot parameters
par(mar = c(8,8,3,3)) # Modify margins to make the plot larger (top, left, bottom, right)
pairs(numeric_df_normalized,
labels = colnames(numeric_df_normalized),
cex.labels = 0.8, # label size
font.labels = 2, # bold
gap = 1, # spacing between plots and labels
upper.panel = NULL,
pch = 21,
bg = "lightblue") # optional: simplify upper triangle
par(old_par)
Variables Examined:
“Total.sqkm”
“Real.GDP.per.capita.USD”
“Life.expectancy.at.birth”
“Total.population”
Statistics Considered:
Minimum
Maximum
star_data <- trim_df[, c("Total.sqkm", "Real.GDP.per.capita.USD", "Life.expectancy.at.birth", "Total.population")]
rownames(star_data) <- trim_df$Country
# For continent-wise plotting
agg_star_data <- aggregate(
star_data,
by = list(Continent = trim_df$Continent),
FUN = min,
na.rm = TRUE
)
rownames(agg_star_data) <- agg_star_data$Continent
agg_star_data$Continent <- NULL # remove redundant column
agg_star_data_scaled <- scale(agg_star_data)
colnames(agg_star_data_scaled) <- c("Sq. km.", "GDP per capita", "Life Expectancy", "Total Pop.")
# For country-wise plotting
star_data_scaled <- scale(star_data)
colnames(star_data_scaled) <- c("Sq. km.", "GDP per capita", "Life Expectancy", "Total Pop.")
The full star plot is available here